library(ggplot2)
library(tree)
library(randomForest)
library(caret)
library(dplyr)
library(ggpubr)
library(plotly)
library(e1071)
library(recipes)
library(forcats)
library(rpart)
library(rpart.plot)
library(pROC)
library(boot)
# Load the autism screening data; "?" marks missing values in the raw CSV.
autism <- read.csv("Autism_data.csv", na = "?")
# 704 participants x 21 columns.
dim(autism)
## [1] 704 21
# First look at every column before any cleaning.
summary(autism)
## A1_Score A2_Score A3_Score A4_Score
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.7216 Mean :0.4531 Mean :0.4574 Mean :0.4957
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## A5_Score A6_Score A7_Score A8_Score
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.4986 Mean :0.2841 Mean :0.4176 Mean :0.6491
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## A9_Score A10_Score age gender
## Min. :0.0000 Min. :0.0000 Min. : 17.0 Length:704
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 21.0 Class :character
## Median :0.0000 Median :1.0000 Median : 27.0 Mode :character
## Mean :0.3239 Mean :0.5739 Mean : 29.7
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 35.0
## Max. :1.0000 Max. :1.0000 Max. :383.0
## NA's :2
## ethnicity jundice austim country_of_residence
## Length:704 Length:704 Length:704 Length:704
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## used_app_before result age_cat relation
## Length:704 Min. : 0.000 Length:704 Length:704
## Class :character 1st Qu.: 3.000 Class :character Class :character
## Mode :character Median : 4.000 Mode :character Mode :character
## Mean : 4.875
## 3rd Qu.: 7.000
## Max. :10.000
##
## Class.ASD
## Length:704
## Class :character
## Mode :character
##
##
##
##
# Sanity check on age: how many participants are implausibly old (> 90)?
# Only one -- the record with age 383, handled further below.
length(which(autism$age > 90))
## [1] 1
#View(autism)
From the summary above we can see that:
- Many predictors should be assigned to their correct type.
- The Max of age is 383 which is very illogical, so a small check was performed to see how many participants had an age > 90, and it appears that only one person had a mistake, being that of the 383.
# Columns to treat as categorical: the ten A*_Score items plus every
# character column -- i.e. everything except age (11) and result (18).
columns <- c(1:10, 12:17, 19:21)
# Convert each selected column to a factor, in place.
autism[, columns] <- as.data.frame(lapply(autism[, columns], as.factor))
summary(autism)
## A1_Score A2_Score A3_Score A4_Score A5_Score A6_Score A7_Score A8_Score
## 0:196 0:385 0:382 0:355 0:353 0:504 0:410 0:247
## 1:508 1:319 1:322 1:349 1:351 1:200 1:294 1:457
##
##
##
##
##
## A9_Score A10_Score age gender ethnicity jundice
## 0:476 0:300 Min. : 17.0 f:337 White-European :233 no :635
## 1:228 1:404 1st Qu.: 21.0 m:367 Asian :123 yes: 69
## Median : 27.0 'Middle Eastern ': 92
## Mean : 29.7 Black : 43
## 3rd Qu.: 35.0 'South Asian' : 36
## Max. :383.0 (Other) : 82
## NA's :2 NA's : 95
## austim country_of_residence used_app_before result
## no :613 'United States' :113 no :692 Min. : 0.000
## yes: 91 'United Arab Emirates': 82 yes: 12 1st Qu.: 3.000
## 'New Zealand' : 81 Median : 4.000
## India : 81 Mean : 4.875
## 'United Kingdom' : 77 3rd Qu.: 7.000
## Jordan : 47 Max. :10.000
## (Other) :223
## age_cat relation Class.ASD
## '18 and more':704 'Health care professional': 4 NO :515
## Others : 5 YES:189
## Parent : 50
## Relative : 28
## Self :522
## NA's : 95
##
Here, above, we are assigning each predictor to its correct type. From the summary we can see what has been adjusted.
# Pair every "1" response in the binary predictors with its ASD class so we
# can see how each score distributes across the two classes.
categorical=matrix(ncol=2, nrow=0)
colnames(categorical)=c("column", "class")
# Exclude the outcome column (21, Class.ASD) explicitly. The original code
# used `columns[-21]`, which is a no-op because `columns` only has 19
# elements; the outcome was silently scanned too (it contributed zero rows
# since it has no "1" level, so the plot itself is unchanged).
for (i in setdiff(columns, 21))
{
m=matrix(ncol=2, nrow=length(which(autism[,i]==1)))
colnames(m)=c("column", "class")
m[,1]=i
m[,2]=as.vector(autism$Class.ASD[which(autism[,i]==1)])
categorical=rbind(categorical, m)
}
# Stacked bar chart: one bar per predictor column, filled by ASD class.
plot = ggplot(as.data.frame(categorical), aes(column)) +
geom_bar(aes(fill=class)) +
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())
plot
Here we are looking at the distribution of the first ten binary variables between the two ASD classes. The figure above shows that every one of those factors occurs in both classes, implying that each has some value in predicting either class. Thus, they will all be kept in the data.
# Count the number of factor levels per categorical predictor and highlight
# the predictor with the most levels.
level = data.frame(sapply(autism[,columns], nlevels))
level$color = 0
colnames(level) = c("number_of_levels", "color")
level$color[which.max(level$number_of_levels)] = 1
level$color=as.factor(level$color)
# Keep the predictor names inside the plotted data frame so every aesthetic
# references a column. Using `level$number_of_levels` and an external
# `variable` vector inside aes() triggered ggplot2's "Use of `level$...` is
# discouraged" warning in the original code.
level$variable = colnames(autism[,columns])
ggplot(data = level) +
geom_col(aes(x=variable, y=number_of_levels, fill= color), show.legend = FALSE) +
geom_text(aes(x=variable, y=number_of_levels, label = number_of_levels), nudge_y = 0.03 , check_overlap = TRUE) +
theme(axis.text.x = element_text(angle = 40, hjust = 1))
The number of levels per predictor is displayed. The plot above shows that the country_of_residence predictor has the most levels, surpassing 32, which can cause problems when building trees; I have rectified this below. Also, it is evident that the age_cat predictor has only one level, so it cannot help predict either class, and I will eliminate it.
# Eliminating age_cat predictor (a single level -- no predictive value)...
autism=autism[,-19]
# Dealing with the country_of_residence predictor...
# Count observations per country.
nb_of_obs = data.frame(tapply(autism$country_of_residence, autism$country_of_residence, length))
nb_of_obs$color = 0
colnames(nb_of_obs) = c("number_of_observations", "color")
nb_of_obs$color = as.factor(nb_of_obs$color)
# Keep the country names inside the plotted data frame so every aesthetic
# references a column. Using `nb_of_obs$number_of_observations` and an
# external `countries` vector inside aes() triggered ggplot2's
# "Use of `nb_of_obs$...` is discouraged" warning in the original code.
nb_of_obs$country = rownames(nb_of_obs)
ggplot(data = nb_of_obs) +
geom_col(aes(x=country, y=number_of_observations, fill= color), show.legend = FALSE) +
geom_text(aes(x=country, y=number_of_observations, label = number_of_observations), nudge_y = 1 , check_overlap = TRUE) +
theme(axis.text.x = element_text(angle = 50, hjust = 1))
The plot above vividly demonstrates that there are several levels with a small number of observations. These predictors will not be removed because they may be beneficial in the forecasts. Instead, I’ll merge all of the levels with fewer than five observations into a single level named “other” below.
# Combining all levels that have < 5 observations
others = rownames(nb_of_obs[which(nb_of_obs$number_of_observations<5),])
# fct_collapse() merges all the rare countries into one "other" level,
# keeping the remaining levels untouched.
autism$country_of_residence = fct_collapse(autism$country_of_residence, other=others)
levels(autism$country_of_residence)
## [1] "other" "'New Zealand'" "'Sri Lanka'"
## [4] "'United Arab Emirates'" "'United Kingdom'" "'United States'"
## [7] "'Viet Nam'" "Afghanistan" "Australia"
## [10] "Brazil" "Canada" "France"
## [13] "India" "Iran" "Ireland"
## [16] "Italy" "Jordan" "Malaysia"
## [19] "Mexico" "Netherlands" "Russia"
# Spread of the two continuous predictors: age (col 11) and result (col 18).
variances = sapply(autism[,c(11,18)], var,na.rm = TRUE)
variances
## age result
## 272.496401 6.257468
standard_deviation = apply(autism[,c(11,18)], 2, sd, na.rm = TRUE)
standard_deviation
## age result
## 16.507465 2.501493
The large variation of the age data indicates that there are a few outliers that are drastically influencing the variance value. I’ll further investigate this.
# Interactive scatter plots of age and result, coloured by ASD class, to
# eyeball outliers and class separation.
plot_ly(data = autism, x = autism$age, type="scatter", mode="markers", color = autism$Class.ASD) %>%
layout (xaxis=list(title="Age"))
plot_ly(data = autism, x = autism$result, type="scatter", mode="markers", color = autism$Class.ASD) %>%
layout (xaxis=list(title="Total screening result"))
The first plot above demonstrates exactly what was stated before in the sense that there is a single abnormal outlier for the age attribute that is 383. This outlier belongs to the NO class of ASD. To deal with this, I will replace this abnormal value of age by the mean of the class it belongs to in the column.
The second plot above shows no abnormal values or outliers. Moreover, it illustrates that the "result" predictor may be significant in predicting "Class/ASD": every value of 6 or below corresponds to the "NO" class of ASD, whereas every value above 6 corresponds to the "YES" class.
# Replacing abnormal values of age by the corresponding mean of the class that it belongs to within that column
index = which(autism$age==383)
class_no = which(autism$Class.ASD=="NO")
# Drop the abnormal row itself from class_no so it cannot contaminate the
# class mean used as the replacement value.
ind = which(class_no==index)
class_no = class_no[-ind]
# NOTE: class_no (with this row removed) is reused further below for the
# class-NO imputations of age, ethnicity and relation.
autism$age[index] = mean(autism$age[class_no],na.rm=TRUE)
# Locate every column that contains at least one NA and count the NAs in it.
na_columns=unique(which(is.na(autism[,]),arr.ind=TRUE)[,2])
# Size the summary matrix by the number of NA columns actually found. The
# original hard-coded nrow=3, which only worked because exactly three
# columns happened to contain NAs.
na_count=matrix(ncol=2,nrow=length(na_columns))
# seq_along() is safe even if no column had NAs (1:length() would give 1:0).
for(i in seq_along(na_columns))
{
na_count[i,2]=length(which(is.na(autism[,na_columns[i]])))
na_count[i,1]=names(autism[na_columns[i]])
}
colnames(na_count)=c("Column","NA count")
na_count
## Column NA count
## [1,] "age" "2"
## [2,] "ethnicity" "95"
## [3,] "relation" "95"
Above are the columns that contain missing values.
- I will deal with the missing values in “age” by replacing them with the mean of the class that they belong to.
- I will deal with the missing values in the categorical variables by replacing them with the mode of the class of each of these variables.
# Replacing missing values in "age"
indicies = which(is.na(autism$age))
# Check which class the NA rows belong to before choosing the fill value.
class = autism$Class.ASD[indicies]
class
## [1] NO NO
## Levels: NO YES
# Both NA rows are class NO, so fill with the class-NO mean (class_no was
# computed above, already excluding the abnormal age-383 row).
autism$age[indicies]=mean(autism$age[class_no],na.rm=TRUE)
As we can see, both NAs in the attribute “age” belong to the class NO of ASD; This is why I replaced them with the mean of that class.
# Replacing missing values in the categorical variables
# Return the most frequent non-NA value of x. Ties are broken in favour of
# the value that appears first among the unique values, matching the order
# produced by unique().
calculate_mode = function(x)
{
  candidates = unique(na.omit(x))
  counts = tabulate(match(x, candidates))
  candidates[which.max(counts)]
}
# Impute missing ethnicity/relation values with the mode of the ASD class
# each row belongs to.
class_yes = which(autism$Class.ASD == "YES")
mode_no_ethnicity = calculate_mode(autism$ethnicity[class_no])
mode_yes_ethnicity = calculate_mode(autism$ethnicity[class_yes])
mode_no_relation = calculate_mode(autism$relation[class_no])
mode_yes_relation = calculate_mode(autism$relation[class_yes])
na_ethnicity = which(is.na(autism$ethnicity))
# seq_along() instead of 1:length(): if a column had no NAs, 1:length(x)
# would iterate over c(1, 0) and fail with an out-of-bounds index.
for(i in seq_along(na_ethnicity))
{
if(autism$Class.ASD[na_ethnicity[i]] == "NO")
autism$ethnicity[na_ethnicity[i]] = mode_no_ethnicity
else
autism$ethnicity[na_ethnicity[i]] = mode_yes_ethnicity
}
na_relation = which(is.na(autism$relation))
for(i in seq_along(na_relation))
{
if(autism$Class.ASD[na_relation[i]] == "NO")
autism$relation[na_relation[i]] = mode_no_relation
else
autism$relation[na_relation[i]] = mode_yes_relation
}
The “result” variable has no outliers, as observed in prior analyses. However, there are several outliers in the age variable; And, when I plot the distribution of the outliers across the two classes, I discover that they all belong to the class “NO” of ASD, therefore they may be useful in forecasting that class, so I will leave them in rather than removing them.
# "result" has no outliers at all:
boxplot(autism$result, col="light green")$out
## numeric(0)
# boxplot(...)$out returns the outlier VALUES themselves, not row indices.
# The original `autism$age[age_outliers]` used these values as row
# positions, so the printed "outlier ages" were actually the ages of
# unrelated rows. Print the outlier values directly instead.
age_outliers = boxplot(autism$age, col=" light green")$out
age_outliers
# Age distribution per ASD class.
boxplot_weight = boxplot(autism$age ~ autism$Class.ASD, data = autism, xlab = 'class', ylab = 'age', col=c("pink","light blue"))
# Correlation between the two continuous predictors (age = 11, result = 18).
cor(autism[,c(11,18)])
## age result
## age 1.00000000 0.09779626
## result 0.09779626 1.00000000
As we can see from the cor() function, the two continuous variables are NOT correlated at all, hence we can keep both of them in the data.
# Testing for normality
# Shapiro-Wilk: a p-value below 0.05 rejects the normality hypothesis.
shapiro.test(autism$age)$p.value
## [1] 2.52974e-20
shapiro.test(autism$result)$p.value
## [1] 2.091149e-13
The age and result variables are both NOT normal, as indicated by the Shapiro test.
# Class balance check: roughly 73% NO vs 27% YES.
length(autism$Class.ASD)
## [1] 704
cat(round((length(which(autism$Class.ASD == "YES"))/length(autism$Class.ASD))*100),"% of class ASD is YES \n")
## 27 % of class ASD is YES
cat(round((length(which(autism$Class.ASD == "NO"))/length(autism$Class.ASD))*100),"% of class ASD is NO \n")
## 73 % of class ASD is NO
As we can see, the data is roughly 70:30, which means that the data is a bit more than slightly imbalanced; However, the imbalance is not big enough to cause many problems. Hence, I will not be dealing with this imbalance.
# Repeated bagging evaluation.
#
# Args:
#   tree_data  data frame containing Class.ASD and the candidate predictors
#   output     vector of true class labels, aligned with rows of tree_data
#   predictors currently unused -- kept for interface compatibility
#   m          number of bootstrap rounds
#   percent    percentage of rows used for training (the rest -> validation)
#
# Returns a list:
#   [[1]] the fitted model from the LAST round only
#   [[2]] confusion table per round
#   [[3]] validation accuracy per round
#   [[4]] per round, the index of the predictor with the lowest
#         MeanDecreaseGini (column 4 of $importance)
bootstrap1 = function(tree_data, output, predictors, m, percent)
{
  n = nrow(tree_data)
  index = seq_len(n)
  validation_nb = floor(n - (n * percent / 100))
  bagging = NULL                 # overwritten each round; only the last survives
  tables = vector("list", m)     # preallocated instead of grown in the loop
  accuracies = numeric(m)
  worst_gini = integer(m)
  # NOTE(review): `predictors` is accepted but never used -- confirm intent.
  for (j in seq_len(m))
  {
    validation = sample(n, validation_nb, replace = FALSE)
    train70 = index[-validation]
    # Bootstrap extra training rows (with replacement) from the training pool.
    train30 = sample(train70, validation_nb, replace = TRUE)
    train = c(train70, train30)
    # Bagging = random forest with mtry equal to the number of predictors.
    bagging = randomForest(Class.ASD ~ ., tree_data, subset = train,
                           mtry = (ncol(tree_data) - 1), importance = TRUE)
    predictions = predict(bagging, newdata = tree_data[validation, ])
    tables[[j]] = table(predict = predictions, truth = output[validation])
    accuracies[j] = (tables[[j]][1, 1] + tables[[j]][2, 2]) / sum(tables[[j]])
    # Column 4 of $importance is MeanDecreaseGini.
    worst_gini[j] = which.min(bagging$importance[, 4])
  }
  list(bagging, tables, accuracies, worst_gini)
}
# Backward elimination driven by bagging: at every step, fit bagged forests
# 100 times (70% training split), record the mean hold-out accuracy, then
# drop the predictor with the lowest MeanDecreaseGini.
variables = colnames(autism[,-c(20)])
tree_data = autism
tree_accuracies = c()
rm_subset = c()
for(i in c(1:18))
{
r = bootstrap1(tree_data, tree_data$Class.ASD, variables, 100, 70)
tree_accuracies[i] = mean(r[[3]])
print(r[[1]]$importance)
# NOTE(review): only the FIRST round's worst predictor (r[[4]][1]) decides
# the removal; a consensus across the 100 rounds might be more robust.
rm_subset = r[[4]][1]
cat("\nThe predictor removed is: ")
print(colnames(tree_data[rm_subset]))
cat("\n")
tree_data=tree_data[,-rm_subset]
}
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A1_Score 0.0000000 0.0000000 0.0000000 0.0000
## A2_Score 0.0000000 0.0000000 0.0000000 0.0000
## A3_Score 0.0000000 0.0000000 0.0000000 0.0000
## A4_Score 0.0000000 0.0000000 0.0000000 0.0000
## A5_Score 0.0000000 0.0000000 0.0000000 0.0000
## A6_Score 0.0000000 0.0000000 0.0000000 0.0000
## A7_Score 0.0000000 0.0000000 0.0000000 0.0000
## A8_Score 0.0000000 0.0000000 0.0000000 0.0000
## A9_Score 0.0000000 0.0000000 0.0000000 0.0000
## A10_Score 0.0000000 0.0000000 0.0000000 0.0000
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2586816 0.7388217 0.3822679 269.8709
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "A1_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A2_Score 0.0000000 0.0000000 0.0000000 0.0000
## A3_Score 0.0000000 0.0000000 0.0000000 0.0000
## A4_Score 0.0000000 0.0000000 0.0000000 0.0000
## A5_Score 0.0000000 0.0000000 0.0000000 0.0000
## A6_Score 0.0000000 0.0000000 0.0000000 0.0000
## A7_Score 0.0000000 0.0000000 0.0000000 0.0000
## A8_Score 0.0000000 0.0000000 0.0000000 0.0000
## A9_Score 0.0000000 0.0000000 0.0000000 0.0000
## A10_Score 0.0000000 0.0000000 0.0000000 0.0000
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2733962 0.7253601 0.3961213 279.8257
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "A2_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A3_Score 0.0000000 0.0000000 0.0000000 0.0000
## A4_Score 0.0000000 0.0000000 0.0000000 0.0000
## A5_Score 0.0000000 0.0000000 0.0000000 0.0000
## A6_Score 0.0000000 0.0000000 0.0000000 0.0000
## A7_Score 0.0000000 0.0000000 0.0000000 0.0000
## A8_Score 0.0000000 0.0000000 0.0000000 0.0000
## A9_Score 0.0000000 0.0000000 0.0000000 0.0000
## A10_Score 0.0000000 0.0000000 0.0000000 0.0000
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2629775 0.7336872 0.3862847 273.2213
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "A3_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A4_Score 0.0000000 0.0000000 0.0000000 0.0000
## A5_Score 0.0000000 0.0000000 0.0000000 0.0000
## A6_Score 0.0000000 0.0000000 0.0000000 0.0000
## A7_Score 0.0000000 0.0000000 0.0000000 0.0000
## A8_Score 0.0000000 0.0000000 0.0000000 0.0000
## A9_Score 0.0000000 0.0000000 0.0000000 0.0000
## A10_Score 0.0000000 0.0000000 0.0000000 0.0000
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2885212 0.7177476 0.4106745 287.6028
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "A4_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A5_Score 0.0000000 0.0000000 0.0000000 0.0000
## A6_Score 0.0000000 0.0000000 0.0000000 0.0000
## A7_Score 0.0000000 0.0000000 0.0000000 0.0000
## A8_Score 0.0000000 0.0000000 0.0000000 0.0000
## A9_Score 0.0000000 0.0000000 0.0000000 0.0000
## A10_Score 0.0000000 0.0000000 0.0000000 0.0000
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2589874 0.7404232 0.3827623 269.7604
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "A5_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A6_Score 0.0000000 0.0000000 0.0000000 0.0000
## A7_Score 0.0000000 0.0000000 0.0000000 0.0000
## A8_Score 0.0000000 0.0000000 0.0000000 0.0000
## A9_Score 0.0000000 0.0000000 0.0000000 0.0000
## A10_Score 0.0000000 0.0000000 0.0000000 0.0000
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2784696 0.7226947 0.4010026 280.8792
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "A6_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A7_Score 0.0000000 0.0000000 0.0000000 0.000
## A8_Score 0.0000000 0.0000000 0.0000000 0.000
## A9_Score 0.0000000 0.0000000 0.0000000 0.000
## A10_Score 0.0000000 0.0000000 0.0000000 0.000
## age 0.0000000 0.0000000 0.0000000 0.000
## gender 0.0000000 0.0000000 0.0000000 0.000
## ethnicity 0.0000000 0.0000000 0.0000000 0.000
## jundice 0.0000000 0.0000000 0.0000000 0.000
## austim 0.0000000 0.0000000 0.0000000 0.000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.000
## used_app_before 0.0000000 0.0000000 0.0000000 0.000
## result 0.2864553 0.7151473 0.4080145 285.239
## relation 0.0000000 0.0000000 0.0000000 0.000
##
## The predictor removed is: [1] "A7_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A8_Score 0.0000000 0.0000000 0.0000000 0.0000
## A9_Score 0.0000000 0.0000000 0.0000000 0.0000
## A10_Score 0.0000000 0.0000000 0.0000000 0.0000
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2817673 0.7099195 0.4023817 285.5982
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "A8_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A9_Score 0.0000000 0.0000000 0.0000000 0.0000
## A10_Score 0.0000000 0.0000000 0.0000000 0.0000
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2391016 0.7598937 0.3627761 256.8998
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "A9_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## A10_Score 0.0000000 0.0000000 0.0000000 0.0000
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2595835 0.7408002 0.3835321 270.9134
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "A10_Score"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## age 0.0000000 0.0000000 0.0000000 0.0000
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2858612 0.7111369 0.4067745 287.3955
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "age"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## gender 0.0000000 0.0000000 0.0000000 0.0000
## ethnicity 0.0000000 0.0000000 0.0000000 0.0000
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2555902 0.7451746 0.3797131 267.7909
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "gender"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## ethnicity 0.0000000 0.000000 0.0000000 0.0000
## jundice 0.0000000 0.000000 0.0000000 0.0000
## austim 0.0000000 0.000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.000000 0.0000000 0.0000
## used_app_before 0.0000000 0.000000 0.0000000 0.0000
## result 0.2676748 0.732505 0.3910602 273.4422
## relation 0.0000000 0.000000 0.0000000 0.0000
##
## The predictor removed is: [1] "ethnicity"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## jundice 0.0000000 0.0000000 0.0000000 0.0000
## austim 0.0000000 0.0000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2624038 0.7384004 0.3862465 272.0612
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "jundice"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## austim 0.0000000 0.000000 0.0000000 0.0000
## country_of_residence 0.0000000 0.000000 0.0000000 0.0000
## used_app_before 0.0000000 0.000000 0.0000000 0.0000
## result 0.2724421 0.728604 0.3956087 280.2794
## relation 0.0000000 0.000000 0.0000000 0.0000
##
## The predictor removed is: [1] "austim"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## country_of_residence 0.0000000 0.0000000 0.0000000 0.0000
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2555843 0.7415147 0.3792537 267.1194
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "country_of_residence"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## used_app_before 0.0000000 0.0000000 0.0000000 0.0000
## result 0.2682702 0.7330063 0.3917995 276.1347
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "used_app_before"
##
## NO YES MeanDecreaseAccuracy MeanDecreaseGini
## result 0.2829299 0.7191797 0.4050972 284.0571
## relation 0.0000000 0.0000000 0.0000000 0.0000
##
## The predictor removed is: [1] "relation"
So, the resultant accuracies of each group of predictors that I obtained after deleting the poorest predictor at each cycle of the bagging approach are as follows:
# Mean hold-out accuracy after each elimination step -- all 1, because the
# "result" predictor alone perfectly separates the classes.
tree_accuracies
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
Because the sole meaningful predictor in this case is "result", the accuracies are all identical, each equal to 1. Since the accuracies are all the same, I will choose the model that uses only the "result" predictor as the best one.
According to the bagging, results is a highly important predictor, but no other predictor appears to have an influence on the output class when results is present. Hence, I will attempt to repeat the bagging process excluding the results predictor in order to see the role of the other predictors in affecting the output, and also what is the second strongest subgroup of predictors:
# Repeat the backward elimination WITHOUT the dominant "result" predictor
# (column 18), to reveal what the remaining predictors contribute.
temp_autism = autism[,-18]
variables = colnames(temp_autism[,-c(19)])
temp_tree_accuracies=c()
temp_rm_subset = c()
temp_rs = list()
for(i in c(1:17))
{
temp_r = bootstrap1(temp_autism, temp_autism$Class.ASD, variables,100,70)
temp_tree_accuracies[i] = mean(temp_r[[3]])
# Keep every round's final model so the best one can be inspected later.
temp_rs[[i]] = temp_r[[1]]
temp_rm_subset = temp_r[[4]][1]
cat("\nThe predictor removed is: ")
print(colnames(temp_autism[temp_rm_subset]))
cat("\n")
temp_autism = temp_autism[,-temp_rm_subset]
}
##
## The predictor removed is: [1] "used_app_before"
##
##
## The predictor removed is: [1] "austim"
##
##
## The predictor removed is: [1] "gender"
##
##
## The predictor removed is: [1] "jundice"
##
##
## The predictor removed is: [1] "relation"
##
##
## The predictor removed is: [1] "A2_Score"
##
##
## The predictor removed is: [1] "A10_Score"
##
##
## The predictor removed is: [1] "A8_Score"
##
##
## The predictor removed is: [1] "A7_Score"
##
##
## The predictor removed is: [1] "A3_Score"
##
##
## The predictor removed is: [1] "ethnicity"
##
##
## The predictor removed is: [1] "A1_Score"
##
##
## The predictor removed is: [1] "A4_Score"
##
##
## The predictor removed is: [1] "A5_Score"
##
##
## The predictor removed is: [1] "A6_Score"
##
##
## The predictor removed is: [1] "country_of_residence"
##
##
## The predictor removed is: [1] "age"
The resultant accuracies of each collection of predictors are shown:
# Mean hold-out accuracy per elimination step without "result": accuracy
# degrades as informative predictors are removed.
temp_tree_accuracies
## [1] 0.9153081 0.9123223 0.9150711 0.9154976 0.9150237 0.9154502 0.9118483
## [8] 0.9102370 0.9016114 0.8958294 0.8895261 0.8907109 0.8815640 0.8670142
## [15] 0.8414692 0.8354028 0.8251659
Without the results predictor, the optimal group of predictors is:
# The model from the elimination step with the highest mean accuracy.
temp_rs[[which.max(temp_tree_accuracies)]]
##
## Call:
## randomForest(formula = Class.ASD ~ ., data = tree_data, mtry = (ncol(tree_data) - 1), importance = TRUE, subset = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 15
##
## OOB estimate of error rate: 4.83%
## Confusion matrix:
## NO YES class.error
## NO 496 19 0.03689320
## YES 15 174 0.07936508
# Variable importance of that best result-free model.
temp_rs[[which.max(temp_tree_accuracies)]]$importance
## NO YES MeanDecreaseAccuracy
## A1_Score 0.0055078146 0.0185450850 0.0090130319
## A2_Score 0.0029227902 0.0047787740 0.0034146803
## A3_Score 0.0079905531 0.0430279529 0.0174546061
## A4_Score 0.0061748681 0.0272238922 0.0118624986
## A5_Score 0.0189614761 0.0917167341 0.0384457157
## A6_Score 0.0473371936 0.0741518049 0.0544281661
## A7_Score 0.0150238807 0.0255617595 0.0178238590
## A8_Score 0.0046095655 0.0123962410 0.0067283738
## A9_Score 0.0582421467 0.1535077222 0.0837121704
## A10_Score 0.0043997341 0.0396820799 0.0139024736
## age 0.0116417560 0.0109680498 0.0114561683
## ethnicity 0.0100921034 0.0115813425 0.0105543605
## jundice 0.0045418794 0.0048944451 0.0046368775
## country_of_residence 0.0274743036 0.0771310140 0.0407952991
## relation 0.0006199142 0.0005913147 0.0006081766
## MeanDecreaseGini
## A1_Score 5.830429
## A2_Score 2.975989
## A3_Score 10.683991
## A4_Score 9.658705
## A5_Score 20.235389
## A6_Score 44.702861
## A7_Score 11.232146
## A8_Score 4.752372
## A9_Score 88.758447
## A10_Score 8.658888
## age 9.623791
## ethnicity 10.528875
## jundice 1.618404
## country_of_residence 44.992748
## relation 1.546257
However, the accuracy of the new group of predictors without the results predictor is less than that of the results predictor -> 0.9 less than 1. As a result, the best subset of the two is the one that includes only the result predictor.
Additionally, the following (Both with and without the results predictor) is another way to perform this model & it includes the Tree Plot for both as well:
# WITH RESULTS ATTRIBUTE
data = autism #to keep autism data set as backup.
# Split a data set into train/test partitions by seeded row sampling.
# Rows get a temporary `id` column (added by the caller) which is used for
# the anti-join and then dropped from both partitions.
# NOTE(review): an identical copy of this function is defined again further
# down -- defining it once would avoid the two copies drifting apart.
sample_split <- function(dataset, split_ratio, seed = NULL) {
set.seed(seed, sample.kind="Rejection")
tr = dataset %>% slice_sample(prop=split_ratio)
#using anti-join to get test data
te = anti_join(dataset, tr, by = 'id')
#remove from tr and te the ID variable - not useful anymore
training = tr %>% dplyr::select(-id)
testing = te %>% dplyr::select(-id)
return(list(
train=training,
test=testing
))
}
data <- data %>% mutate(id=row_number())
data_split <- sample_split(data, 0.7, seed = 124)
#classification tree (method = "class" -- the original comment said "regression")
data_tree <- rpart(formula = Class.ASD ~. , data = data_split$train, method = "class")
#predict
data_pred <- predict(object = data_tree, newdata = data_split$test)
#pruning
data_tree$cptable
## CP nsplit rel error xerror xstd
## 1 1.00 0 1 1 0.07008512
## 2 0.01 1 0 0 0.00000000
#cp here is the alpha in the slides (cost complexity pruning)
#we want to pick the CP with the smallest xerror -- the 2nd row here, per the output below
#converting it to a data frame to make it easier to access columns
cp_table = data.frame(data_tree$cptable)
which.min(cp_table$xerror) #to give the CP that has the lowest error
## [1] 2
wm <- cp_table$CP[which.min(cp_table$xerror)]
min(cp_table$xerror) #to give the lowest error
## [1] 0
pruned <- prune(tree = data_tree, cp = wm) #this is not the only way
#plotting tree after pruning
rpart.plot(pruned)
As we can see, the results factor has a very significant role in determining whether a patient falls into the Yes / No classes of ASD; Hence, it is masking the role / effect of other predictors => Below is the Tree without the results predictor:
# WITHOUT RESULT ATTRIBUTE
data_noRESULT = autism[,-18] #to keep autism data set as backup.
# Same train/test splitter as above -- this is an exact duplicate of the
# earlier sample_split(); NOTE(review): define it once to avoid drift.
sample_split <- function(dataset, split_ratio, seed = NULL) {
set.seed(seed, sample.kind="Rejection")
tr = dataset %>% slice_sample(prop=split_ratio)
#using anti-join to get test data
te = anti_join(dataset, tr, by = 'id')
#remove from tr and te the ID variable - not useful anymore
training = tr %>% dplyr::select(-id)
testing = te %>% dplyr::select(-id)
return(list(
train=training,
test=testing
))
}
data_noRESULT <- data_noRESULT %>% mutate(id=row_number())
data_noRESULT_split <- sample_split(data_noRESULT, 0.7, seed = 124)
#regression tree
data_tree_noRESULT <- rpart(formula = Class.ASD ~. , data = data_noRESULT_split$train, method = "class")
#predict
data_tree_noRESULT_pred <- predict(object = data_tree_noRESULT, newdata = data_noRESULT_split$test)
#pruning
data_tree_noRESULT$cptable
## CP nsplit rel error xerror xstd
## 1 0.45138889 0 1.0000000 1.0000000 0.07008512
## 2 0.12500000 1 0.5486111 0.5486111 0.05655142
## 3 0.04166667 2 0.4236111 0.5416667 0.05626035
## 4 0.02083333 3 0.3819444 0.4791667 0.05348718
## 5 0.01000000 7 0.2916667 0.4513889 0.05215856
#cp here is the alpha in the slides (cost complexity pruning)
#we want to pick the CP row that gives the smallest cross-validated error (xerror);
#per the table above, that is row 5 (7 splits). The number of splits follows from that row.
#converting it to a data frame to make it easier to access columns
cp_table = data.frame(data_tree_noRESULT$cptable)
which.min(cp_table$xerror) #row index of the CP value with the lowest xerror (not the CP itself)
## [1] 5
wm <- cp_table$CP[which.min(cp_table$xerror)]
min(cp_table$xerror) #the lowest cross-validated error itself
## [1] 0.4513889
pruned <- prune(tree = data_tree_noRESULT, cp = wm) #prune at the chosen cp; this is not the only way
#plot tree after pruning
rpart.plot(pruned)
As expected, we can now observe more decision nodes in the tree, since we no longer have the masking effect of the result predictor.
bootstrap2 = function(data, output, predictors, m, percent, mtry)
{
  # Repeated bootstrap evaluation of a random forest.
  #   data       : full data set (must contain a Class.ASD column)
  #   output     : vector of true class labels, aligned with the rows of data
  #   predictors : kept for interface compatibility; currently unused
  #                (the formula Class.ASD ~ . is hard-coded below)
  #   m          : number of bootstrap repetitions
  #   percent    : percentage of rows used for training (rest is validation)
  #   mtry       : number of variables tried at each split
  # Returns list(forests, accuracies, confusion tables).
  n = nrow(data)
  index = seq_len(n)
  validation_nb = floor(n - (n * percent / 100))
  rfs = list()
  accuracies = numeric(m)   # preallocate instead of growing in the loop
  tables = list()
  for (j in seq_len(m))     # seq_len() is safe for m = 0, unlike 1:m
  {
    # Hold out a validation set, then pad the remaining training pool with
    # a bootstrap resample (with replacement) of the same size as the holdout.
    validation = sample(n, validation_nb, replace = FALSE)
    train70 = index[-validation]
    train30 = sample(train70, validation_nb, replace = TRUE)
    train = c(train70, train30)
    rfs[[j]] = randomForest(Class.ASD ~ ., data[train, ], mtry = mtry)
    predictions = predict(rfs[[j]], newdata = data[validation, ])
    tables[[j]] = table(predict = predictions, truth = output[validation])
    # sum(diag(.)) equals the old [1,1] + [2,2] on a 2x2 table, but does not
    # error out if one class happens to be absent from the table.
    accuracies[j] = sum(diag(tables[[j]])) / sum(tables[[j]])
  }
  list(rfs, accuracies, tables)
}
#mtry heuristic: square root of the number of columns
#(note: ncol(autism) counts all 21 columns, including the response itself)
mtry = sqrt(ncol(autism))
#predictor names with column 20 dropped
#NOTE(review): column 20 is 'relation', not Class.ASD (column 21) — confirm the
#intended exclusion; bootstrap2 does not actually use this argument, so it has no effect here.
variables = colnames(autism[,-c(20)])
rff = bootstrap2(autism, autism$Class.ASD, variables, 100, 70, mtry)
#mean validation accuracy over the 100 bootstrap repetitions
accuracyy = mean(rff[[2]])
accuracyy
## [1] 1
As shown here, the Random Forest bootstrap accuracy is 1.
I will run the bootstrap only 50 times for the SVM models, as they are too computationally costly and take too long otherwise.
bootstrap3 = function(data, output, predictors, m, percent)
{
  # Repeated bootstrap evaluation of a linear-kernel SVM, tuning the cost
  # parameter on each bootstrap training set.
  #   data       : full data set (must contain a Class.ASD column)
  #   output     : vector of true class labels, aligned with the rows of data
  #   predictors : kept for interface compatibility; currently unused
  #   m          : number of bootstrap repetitions
  #   percent    : percentage of rows used for training (rest is validation)
  # Returns list(tune objects, confusion tables, accuracies, best costs).
  n = nrow(data)
  index = seq_len(n)
  validation_nb = floor(n - (n * percent / 100))
  models = list()
  best_models = list()
  tables = list()
  costs = numeric(m)        # preallocate instead of growing in the loop
  accuracies = numeric(m)
  for (j in seq_len(m))
  {
    validation = sample(n, validation_nb, replace = FALSE)
    train70 = index[-validation]
    train30 = sample(train70, validation_nb, replace = TRUE)
    train = c(train70, train30)
    # BUG FIX: tune() previously received the whole data set (data = data),
    # so the validation rows leaked into training and the bootstrap `train`
    # sample was never used. Fit on the bootstrap training rows only.
    models[[j]] = tune(svm,
                       Class.ASD ~ .,
                       data = data[train, ],
                       kernel = "linear",
                       ranges = list(cost = c(0.001, 0.01, 0.1, 1, 2, 5, 10, 50, 100)),
                       scale = FALSE)
    best_models[[j]] = models[[j]]$best.model
    costs[j] = best_models[[j]]$cost
    predictions = predict(best_models[[j]], data[validation, ])
    tables[[j]] = table(predict = predictions, truth = output[validation])
    # sum(diag(.)) equals the old [1,1] + [2,2] on a 2x2 table, but does not
    # error out if one class happens to be absent from the table.
    accuracies[j] = sum(diag(tables[[j]])) / sum(tables[[j]])
  }
  list(models, tables, accuracies, costs)
}
#predictor names (column 20 dropped); bootstrap3 does not actually use this argument
variables = colnames(autism[,-c(20)])
#50 bootstrap repetitions of the linear-kernel SVM with cost tuning
result = bootstrap3(autism, autism$Class.ASD, variables, 50, 70)
#mean validation accuracy across the repetitions
accuracy = mean(result[[3]])
For the linear kernel, the accuracy and best cost value, respectively, are as follows:
accuracy
## [1] 1
#mean of the best cost values chosen by tune() across the 50 repetitions
cost = mean(result[[4]])
cost
## [1] 0.1
bootstrap4 = function(data, output, predictors, m, percent, degree)
{
  # Repeated bootstrap evaluation of a polynomial-kernel SVM, tuning cost
  # and gamma on each bootstrap training set.
  #   data       : full data set (must contain a Class.ASD column)
  #   output     : vector of true class labels, aligned with the rows of data
  #   predictors : kept for interface compatibility; currently unused
  #   m          : number of bootstrap repetitions
  #   percent    : percentage of rows used for training (rest is validation)
  #   degree     : polynomial degree of the kernel
  # Returns list(tune objects, tables, accuracies, best costs, best gammas).
  n = nrow(data)
  index = seq_len(n)
  validation_nb = floor(n - (n * percent / 100))
  models = list()
  best_models = list()
  tables = list()
  accuracies = numeric(m)   # preallocate instead of growing in the loop
  gammas = numeric(m)
  costs = numeric(m)
  for (j in seq_len(m))
  {
    validation = sample(n, validation_nb, replace = FALSE)
    train70 = index[-validation]
    train30 = sample(train70, validation_nb, replace = TRUE)
    train = c(train70, train30)
    # BUG FIX: tune() previously received the whole data set (data = data),
    # so the validation rows leaked into training and the bootstrap `train`
    # sample was never used. Fit on the bootstrap training rows only.
    models[[j]] = tune(svm,
                       Class.ASD ~ .,
                       data = data[train, ],
                       kernel = "polynomial",
                       degree = degree,
                       ranges = list(cost = c(0.001, 0.01, 0.1, 1, 10, 100),
                                     gamma = c(0.1, 0.5, 1, 2, 3, 4)),
                       scale = FALSE)
    best_models[[j]] = models[[j]]$best.model
    costs[j] = best_models[[j]]$cost
    gammas[j] = best_models[[j]]$gamma
    predictions = predict(best_models[[j]], data[validation, ])
    tables[[j]] = table(predict = predictions, truth = output[validation])
    # sum(diag(.)) equals the old [1,1] + [2,2] on a 2x2 table, but does not
    # error out if one class happens to be absent from the table.
    accuracies[j] = sum(diag(tables[[j]])) / sum(tables[[j]])
  }
  list(models, tables, accuracies, costs, gammas)
}
#collect mean accuracy / best cost / best gamma for polynomial degrees 2 through 5
accuracies = c()
costs = c()
gammas = c()
for(i in c(2:5)) # to check for different degrees
{
results = bootstrap4(autism, autism$Class.ASD, variables, 50, 70, i)
accuracies[i-1] = mean(results[[3]]) #index shifted so degree 2 lands in slot 1
costs[i-1] = mean(results[[4]])
gammas[i-1] = mean(results[[5]])
}
For the polynomial kernels, the accuracy, best cost, and gamma values for the used degrees are:
#report mean accuracy, best cost, and best gamma for each polynomial degree (slots 1..4 = degrees 2..5)
cat("\nFor polynomial degree 2, the accuracy, best cost, & gamma values are: \n")
##
## For polynomial degree 2, the accuracy, best cost, & gamma values are:
accuracies[1]
## [1] 1
costs[1]
## [1] 0.38404
gammas[1]
## [1] 0.116
cat("\n")
cat("\nFor polynomial degree 3, the accuracy, best cost, & gamma values are: \n")
##
## For polynomial degree 3, the accuracy, best cost, & gamma values are:
accuracies[2]
## [1] 1
costs[2]
## [1] 0.00604
gammas[2]
## [1] 0.1
cat("\n")
cat("\nFor polynomial degree 4, the accuracy, best cost, & gamma values are: \n")
##
## For polynomial degree 4, the accuracy, best cost, & gamma values are:
accuracies[3]
## [1] 1
costs[3]
## [1] 0.001
gammas[3]
## [1] 0.158
cat("\n")
cat("\nFor polynomial degree 5, the accuracy, best cost, & gamma values are: \n")
##
## For polynomial degree 5, the accuracy, best cost, & gamma values are:
accuracies[4]
## [1] 1
costs[4]
## [1] 0.001
gammas[4]
## [1] 0.1
cat("\n")
bootstrap5=function(data, output, predictors, m, percent)
{
  # Repeated bootstrap evaluation of a radial-kernel SVM, tuning cost and
  # gamma on each bootstrap training set.
  #   data       : full data set (must contain a Class.ASD column)
  #   output     : vector of true class labels, aligned with the rows of data
  #   predictors : kept for interface compatibility; currently unused
  #   m          : number of bootstrap repetitions
  #   percent    : percentage of rows used for training (rest is validation)
  # Returns list(tune objects, tables, accuracies, best costs, best gammas).
  n = nrow(data)
  index = seq_len(n)
  validation_nb = floor(n - (n * percent / 100))
  models = list()
  best_models = list()
  tables = list()
  accuracies = numeric(m)   # preallocate instead of growing in the loop
  costs = numeric(m)
  gammas = numeric(m)
  for (j in seq_len(m))
  {
    validation = sample(n, validation_nb, replace = FALSE)
    train70 = index[-validation]
    train30 = sample(train70, validation_nb, replace = TRUE)
    train = c(train70, train30)
    # BUG FIX: tune() previously received the whole data set (data = data),
    # so the validation rows leaked into training and the bootstrap `train`
    # sample was never used. Fit on the bootstrap training rows only.
    models[[j]] = tune(svm,
                       Class.ASD ~ .,
                       data = data[train, ],
                       kernel = "radial",
                       ranges = list(cost = c(0.001, 0.01, 0.1, 1, 10, 100),
                                     gamma = c(0.1, 0.5, 1, 2, 3, 4)),
                       scale = FALSE)
    best_models[[j]] = models[[j]]$best.model
    costs[j] = best_models[[j]]$cost
    gammas[j] = best_models[[j]]$gamma
    predictions = predict(best_models[[j]], data[validation, ])
    tables[[j]] = table(predict = predictions, truth = output[validation])
    # sum(diag(.)) equals the old [1,1] + [2,2] on a 2x2 table, but does not
    # error out if one class happens to be absent from the table.
    accuracies[j] = sum(diag(tables[[j]])) / sum(tables[[j]])
  }
  list(models, tables, accuracies, costs, gammas)
}
#50 bootstrap repetitions of the radial-kernel SVM (cost and gamma tuned)
result2 = bootstrap5(autism, autism$Class.ASD, variables, 50, 70)
#mean validation accuracy across the repetitions
accuracy2 = mean(result2[[3]])
For the radial kernel, the accuracy, best cost, and gamma values are as follows:
accuracy2
## [1] 1
#mean of the best cost values chosen by tune() across the repetitions
cost=mean(result2[[4]])
cost
## [1] 8.74
#mean of the best gamma values across the repetitions
gamma=mean(result2[[5]])
gamma
## [1] 0.1
The radial kernel bootstrap accuracy rate is 1, with a mean best cost value of about 8.74 and a gamma value of 0.1.
#assemble a one-row data frame summarizing every model's accuracy
accuraciess = data.frame(matrix(ncol = 0, nrow = 1))
accuraciess$tree = tree_accuracies[1] #tree_accuracies is computed earlier in the document (not shown in this chunk)
accuraciess$random_forest = accuracyy
accuraciess$linear = accuracy
accuraciess$polynomial2 = accuracies[1]
accuraciess$polynomial3 = accuracies[2]
accuraciess$polynomial4 = accuracies[3]
accuraciess$polynomial5 = accuracies[4]
accuraciess$radial = accuracy2
accuraciess
## tree random_forest linear polynomial2 polynomial3 polynomial4 polynomial5
## 1 1 1 1 1 1 1 1
## radial
## 1 1
All of the models — the SVM models as well as the random forest and tree models — achieve the same accuracy of 1.
As we can see from the model results, we obtained an accuracy of 1 on almost every model, which is of course suspicious: we can never truly achieve 100% accuracy on anything. Initially we might think this is due to an imbalanced data set; however, I tested that (in the data visualization section) and found that Class.ASD has roughly a 70:30 ratio, which is only slightly imbalanced and not enough to explain the perfect accuracy. A more likely cause is over-fitting introduced by the bootstrapping procedure — in particular, the SVM bootstrap functions pass the full data set to tune() rather than the bootstrap training rows, so the validation rows were seen during training. Because of these degenerate results, I was not able to generate a proper ROC curve or compute the AUC.